library(dplyr)
library(modelr)
library(ggplot2)
library(skimr)
library(broom)
wages <- heights %>% filter(income > 0)

Wages data

wages %>% skim()
wages %>% 
  ggplot(aes(log(income))) + geom_histogram(binwidth = 0.25)

Linear models

mod_e <- lm(log(income) ~ education, data = wages)

Your Turn

Fit the model on the slide and then examine the output. What does it look like?

mod_e
## 
## Call:
## lm(formula = log(income) ~ education, data = wages)
## 
## Coefficients:
## (Intercept)    education  
##      8.5577       0.1418
summary(mod_e)
## 
## Call:
## lm(formula = log(income) ~ education, data = wages)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.7893 -0.3563  0.1328  0.5798  2.9136 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 8.557691   0.073260  116.81   <2e-16 ***
## education   0.141840   0.005305   26.74   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9923 on 5262 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.1196, Adjusted R-squared:  0.1195 
## F-statistic:   715 on 1 and 5262 DF,  p-value: < 2.2e-16
names(mod_e)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "na.action"     "xlevels"       "call"          "terms"        
## [13] "model"
mod_e$coefficients
## (Intercept)   education 
##   8.5576906   0.1418404

Plotting models with base R

plot(mod_e)

plot(mod_e, which=c(1,2))

With ggplot2

ggplot(mod_e, aes(x=.fitted, y=.resid)) + geom_point() + geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam'

ggplot(mod_e, aes(sample = .resid)) + geom_qq()

Broom

mod_e %>% tidy()
mod_e %>% glance()
mod_e %>% augment()
## Warning: Deprecated: please use `purrr::possibly()` instead
## Warning: Deprecated: please use `purrr::possibly()` instead
## Warning: Deprecated: please use `purrr::possibly()` instead
## Warning: Deprecated: please use `purrr::possibly()` instead
## Warning: Deprecated: please use `purrr::possibly()` instead
mod_e %>% augment(data = wages)
## Warning: Deprecated: please use `purrr::possibly()` instead
## Warning: Deprecated: please use `purrr::possibly()` instead
## Warning: Deprecated: please use `purrr::possibly()` instead
## Warning: Deprecated: please use `purrr::possibly()` instead
## Warning: Deprecated: please use `purrr::possibly()` instead

Your turn

Model log(income) against height. Then use broom and dplyr functions to extract: The coefficient estimates and their related statistics The adj.r.squared and p.value for the overall model

mod_h <- lm(log(income) ~ height, data=wages)
mod_h %>%
  tidy()
mod_h %>%
  glance() %>%
  select(p.value, r.squared)

Multivariate regression

Your Turn

Model log(income) against education and height. Do the coefficients change?

mod_eh <- lm(log(income) ~ education + height, data = wages)

mod_eh %>% 
  tidy()

Your Turn

Model log(income) against education and height and sex. Can you interpret the coefficients?

mod_ehs <- lm(log(income) ~ education + height + sex, data = wages)

mod_ehs %>% 
  tidy()

Logistic regression

library(fivethirtyeight)
data(bechdel)
bechdel <- bechdel %>%
  mutate(pass = if_else(binary == "PASS", 0, 1))
mod_pass <- glm(pass~budget, data=bechdel, family=binomial)
summary(mod_pass)
## 
## Call:
## glm(formula = pass ~ budget, family = binomial, data = bechdel)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9312  -1.2088   0.9046   1.1221   1.1955  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.248e-02  6.606e-02  -0.643     0.52    
## budget       5.797e-09  1.083e-09   5.354  8.6e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2467.3  on 1793  degrees of freedom
## Residual deviance: 2436.2  on 1792  degrees of freedom
## AIC: 2440.2
## 
## Number of Fisher Scoring iterations: 4

Your turn

Model pass against budget, year and domgross_2013. tidy() the model output.

mod_pass2 <- glm(pass~budget + year + domgross_2013, data=bechdel, family=binomial)

mod_pass2 %>%
  tidy()

Take Aways